# Load the 2025 egg price data and standardise the column names.
egg_data <- read.csv("eggs_price_2025.csv")
colnames(egg_data) <- c("Date", "Price")

# Interactive time-series plot of the raw prices (plotly).
fig_raw <- plot_ly(egg_data, x = ~as.Date(Date))
fig_raw <- add_lines(fig_raw, y = ~Price, name = 'Egg prices (USD)',
                     line = list(color = 'blue'))
layout(fig_raw,
       title = "Time series plot in Egg Prices",
       xaxis = list(title = "Date"),
       yaxis = list(title = "Egg Prices"))
# Review Midterm
The questions will be the same as the sample exam, but the dataset is the eggs_price dataset in lab4's folder.
# Convert the prices to a monthly time series starting at the earliest
# observed date (decimal_date gives the fractional-year start for ts()).
start_dec <- decimal_date(as.Date(min(egg_data$Date)))
ts_egg <- ts(egg_data$Price, start = start_dec, frequency = 12) # monthly data
plot(ts_egg, main = "Time Series plot")
# Checking for stationarity
# ACF of the undifferenced series out to 50 lags.
f1 <- ggAcf(ts_egg, lag.max = 50) +
  ggtitle("ACF plot without differencing") +
  theme_minimal()
f1
# The ACF decays slowly to 0, a typical sign of non-stationarity;
# the ADF test below indicates the same.
# ADF test on the raw series: p-value 0.2793 (output below) fails to
# reject the unit-root null, so the series is treated as non-stationary.
tseries::adf.test(ts_egg)
Augmented Dickey-Fuller Test
data: ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary
Lag plot:
# Lag plot of the raw series (points only, no connecting lines).
lag_fig <- gglagplot(ts_egg, do.lines = FALSE) +
  ggtitle("Lag Plot of Original Egg Prices") +
  theme_minimal()
lag_fig
# Lags 1-4 show clear positive autocorrelation; at higher lags the
# pattern is less consistent.
# Use library() rather than require() for a hard dependency: library()
# errors immediately if gridExtra is missing, while require() only
# returns FALSE and lets the script fail later at grid.arrange().
library(gridExtra)
Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
# Stack the raw series above its log transform to judge whether a
# variance-stabilising transformation is needed.
p_raw <- autoplot(ts_egg, main = "without the log transformation")
p_log <- autoplot(log(ts_egg), main = "log transformed data")
grid.arrange(p_raw, p_log, nrow = 2)
# The two panels look very similar, so no log transformation is needed.
# Re-run the ADF test after the transformation check: p-value 0.2793
# again (output below), confirming the series needs differencing.
tseries::adf.test(ts_egg)
Augmented Dickey-Fuller Test
data: ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary
The series is not stationary, so we take the first difference.
# ACF of the original series (50 lags): slow decay => non-stationary.
fig1 <- ggAcf(ts_egg, lag.max = 50) +
  ggtitle("ACF of egg's price time series") +
  theme_minimal()
fig1
library(patchwork)Warning: package 'patchwork' was built under R version 4.2.3
# First (regular) difference of the series; d = 1.
# Renamed from `diff`, which shadowed base::diff.
egg_diff <- diff(ts_egg)

fig_diff1 <- ggAcf(egg_diff, 50) +
  ggtitle("ACF For differenced egg price") +
  theme_minimal() # q = 0, 1, 2

# BUG FIX: this plot was originally assigned to fig_diff2_pacf and then
# overwritten by the seasonal PACF below, so the layout displayed the
# seasonal PACF twice and the non-seasonal PACF never appeared.
fig_diff1_pacf <- ggPacf(egg_diff, 50) +
  ggtitle("PACF For differenced egg price") +
  theme_minimal() # p = 0, 1, 2, 3

# Additional seasonal difference at lag 12; D = 1.
diff_2 <- diff(egg_diff, lag = 12)
fig_diff2_acf <- ggAcf(diff_2, 50) +
  ggtitle("ACF for seasonal differencing") +
  theme_minimal() # Q = 1 or Q = 0
fig_diff2_pacf <- ggPacf(diff_2, 50) +
  ggtitle("PACF for seasonal differencing") +
  theme_minimal() # P = 1, 2, 3, 4

# 2x2 panel: non-seasonal ACF/PACF on top, seasonal ACF/PACF below.
(fig_diff1 | fig_diff1_pacf) / (fig_diff2_acf | fig_diff2_pacf)
# Candidate orders: q = 0,1,2  p = 0,1,2,3  d = 1  D = 1  Q = 0,1  P = 1,2,3,4
######################## Check for different combinations ########
# Grid-search SARIMA(p-1, d, q-1)(P-1, D, Q-1)[s] models (d = D = 1,
# seasonal period taken from frequency(data)) over the supplied 1-based
# counter ranges, keeping the original filter p+d+q+P+D+Q <= 9 on the
# counters so the same set of models is fitted as before.
#
# Args:
#   p1, p2 / q1, q2 - counter ranges for the non-seasonal AR / MA orders
#                     (the fitted orders are the counters minus 1)
#   P1, P2 / Q1, Q2 - counter ranges for the seasonal AR / MA orders
#   data            - time series passed to forecast::Arima
# Returns:
#   data.frame with columns p, d, q, P, D, Q, AIC, BIC, AICc — one row
#   per fitted model (no NA padding rows).
SARIMA.c <- function(p1, p2, q1, q2, P1, P2, Q1, Q2, data) {
  d <- 1
  D <- 1
  # Preallocate for the worst case instead of the old hard-coded 35 rows,
  # which silently overflowed when more combinations passed the filter.
  K <- (p2 - p1 + 1) * (q2 - q1 + 1) * (P2 - P1 + 1) * (Q2 - Q1 + 1)
  ls <- matrix(NA_real_, nrow = K, ncol = 9)
  i <- 1
  for (p in p1:p2) {
    for (q in q1:q2) {
      for (P in P1:P2) {
        for (Q in Q1:Q2) {
          if (p + d + q + P + D + Q <= 9) {
            model <- Arima(data,
                           order = c(p - 1, d, q - 1),
                           seasonal = c(P - 1, D, Q - 1))
            ls[i, ] <- c(p - 1, d, q - 1, P - 1, D, Q - 1,
                         model$aic, model$bic, model$aicc)
            i <- i + 1
          }
        }
      }
    }
  }
  # Keep only the rows that were actually filled.
  temp <- as.data.frame(ls[seq_len(i - 1), , drop = FALSE])
  names(temp) <- c("p", "d", "q", "P", "D", "Q", "AIC", "BIC", "AICc")
  temp
}
# Candidate orders: q = 0,1  p = 0,1,2  d = 1  D = 1  Q = 0,1  P = 0,1,2,3
# Run the grid search over the candidate orders identified from the
# ACF/PACF plots (fitted orders are the counters minus 1).
output <- SARIMA.c(p1 = 1, p2 = 3, q1 = 1, q2 = 2,
                   P1 = 1, P2 = 4, Q1 = 1, Q2 = 2, data = ts_egg)
# output
# knitr::kable(output)
output[which.min(output$AIC),] p d q P D Q AIC BIC AICc
22 2 1 0 0 1 1 -689.8555 -672.7867 -689.7789
output[which.min(output$BIC),] p d q P D Q AIC BIC AICc
22 2 1 0 0 1 1 -689.8555 -672.7867 -689.7789
output[which.min(output$AICc),] p d q P D Q AIC BIC AICc
22 2 1 0 0 1 1 -689.8555 -672.7867 -689.7789
The grid search finds that ARIMA(2, 1, 0)(0, 1, 1)[12] is the best model.
auto.arima(ts_egg)Series: ts_egg
ARIMA(4,1,1)(0,0,2)[12] with drift
Coefficients:
ar1 ar2 ar3 ar4 ma1 sma1 sma2 drift
0.9745 0.0595 -0.0021 -0.1609 -0.9473 0.1057 0.1498 0.0045
s.e. 0.0506 0.0609 0.0617 0.0459 0.0297 0.0489 0.0572 0.0028
sigma^2 = 0.0155: log likelihood = 361.64
AIC=-705.27 AICc=-704.93 BIC=-666.66
auto.arima selects ARIMA(4, 1, 1)(0, 0, 2)[12] as the best model.
Diagnose:
# Set seed for reproducibility
set.seed(123)

# ---- Fit SARIMA models & extract key output dynamically ----
# Model 1: SARIMA(2,1,0)(0,1,1)[12]
model_output1 <- capture.output(sarima(ts_egg, 2, 1, 0, 0, 1, 1, 12))
# Locate the coefficient table by keyword rather than fixed line numbers.
start_line1 <- grep("Coefficients", model_output1)
end_line1 <- length(model_output1)
# Print everything from the "Coefficients" header to the end.
cat(model_output1[start_line1:end_line1], sep = "\n")
Estimate SE t.value p.value
ar1 0.0590 0.0434 1.3601 0.1744
ar2 0.1413 0.0436 3.2421 0.0013
sma1 -0.9292 0.0237 -39.1341 0.0000
sigma^2 estimated as 0.01488395 on 524 degrees of freedom
AIC = -1.309024 AICc = -1.308937 BIC = -1.276635
# ---- Model 2: SARIMA(4,1,1)(0,0,2)[12] ----
model_output2 <- capture.output(sarima(ts_egg, 4, 1, 1, 0, 0, 2, 12))
# Locate the coefficient table by keyword rather than fixed line numbers.
start_line2 <- grep("Coefficients", model_output2)
end_line2 <- length(model_output2)
# Print everything from the "Coefficients" header to the end.
cat(model_output2[start_line2:end_line2], sep = "\n")
Estimate SE t.value p.value
ar1 0.9745 0.0506 19.2578 0.0000
ar2 0.0595 0.0609 0.9762 0.3294
ar3 -0.0021 0.0617 -0.0340 0.9729
ar4 -0.1609 0.0459 -3.5025 0.0005
ma1 -0.9473 0.0297 -31.9322 0.0000
sma1 0.1057 0.0489 2.1644 0.0309
sma2 0.1498 0.0572 2.6177 0.0091
constant 0.0045 0.0028 1.6052 0.1090
sigma^2 estimated as 0.01527376 on 531 degrees of freedom
AIC = -1.30848 AICc = -1.307976 BIC = -1.236852
ARIMA(2, 1, 0)(0, 1, 1)[12] is the best, since:
The residual plot shows fairly consistent fluctuation around zero; after 2020 the residuals fluctuate more than in earlier years, but they still center on zero, so we can treat the residuals as stationary.
The Autocorrelation Function (ACF) of the residuals reveals significant autocorrelations around lag 1.5 and high order lags, which are too high to include in the model.
The Q-Q Plot indicates that the residuals follow a near-normal distribution, with minor deviations at the tails, which is typical in time series data.
The p-value plot shows widespread insignificance from lag 0 to lag 16, and AR2 and AR3 are not significant in this case.
# ---- Refit the chosen model and show its coefficient table ----
# Model 1: SARIMA(2,1,0)(0,1,1)[12]
model_output1 <- capture.output(sarima(ts_egg, 2, 1, 0, 0, 1, 1, 12))
# Find the coefficient section dynamically by keyword.
start_line1 <- grep("Coefficients", model_output1)
end_line1 <- length(model_output1)
# Print from the "Coefficients" header to the end of the output.
cat(model_output1[start_line1:end_line1], sep = "\n")
Estimate SE t.value p.value
ar1 0.0590 0.0434 1.3601 0.1744
ar2 0.1413 0.0436 3.2421 0.0013
sma1 -0.9292 0.0237 -39.1341 0.0000
sigma^2 estimated as 0.01488395 on 524 degrees of freedom
AIC = -1.309024 AICc = -1.308937 BIC = -1.276635
Fit the model:
The residual plot shows fairly consistent fluctuation around zero; after 2020 the residuals fluctuate more than in earlier years, but they still center on zero, so we can treat the residuals as stationary.
The Autocorrelation Function (ACF) of the residuals reveals significant autocorrelations around lag 0 and high order lags, which are too high to include in the model.
The Q-Q Plot indicates that the residuals follow a near-normal distribution, with minor deviations at the tails, which is typical in time series data.
Except for ar1, which is not significant, all other parameters are significant.
# Refit the chosen SARIMA(2,1,0)(0,1,1)[12] with forecast::Arima so the
# object can be used for forecasting below.
fit <- Arima(ts_egg,
             order = c(2, 1, 0),
             seasonal = list(order = c(0, 1, 1), period = 12))
summary(fit)
ARIMA(2,1,0)(0,1,1)[12]
Coefficients:
ar1 ar2 sma1
0.0590 0.1413 -0.9292
s.e. 0.0434 0.0436 0.0237
sigma^2 = 0.01497: log likelihood = 348.93
AIC=-689.86 AICc=-689.78 BIC=-672.79
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.003310636 0.1205224 0.07092409 -0.03535937 4.723159 0.2889723
ACF1
Training set -0.01120366
library(ggplot2)
library(forecast)
library(tsibble)
library(dplyr)

# Convert ts_egg back to a data frame with real calendar dates.
# FIX: the start date was hard-coded as "1980-01-01" (with a note to
# "adjust based on actual data"); derive it from egg_data instead — the
# same source ts_egg was built from — so the dates always line up.
start_date <- as.Date(min(egg_data$Date))
date_seq <- seq(start_date, by = "month", length.out = length(ts_egg))
ts_egg_df <- data.frame(
  Date = date_seq,
  Price = as.numeric(ts_egg)
)
# Fit an ARIMA model (auto.arima selects the order automatically).
fit <- auto.arima(ts_egg)

# Benchmark forecasts (12 months ahead) plus the ARIMA forecast.
mean_forecast <- forecast(meanf(ts_egg, h = 12))
naive_forecast <- forecast(naive(ts_egg, h = 12))
snaive_forecast <- forecast(snaive(ts_egg, h = 12))
drift_forecast <- forecast(rwf(ts_egg, h = 12, drift = TRUE))
arima_forecast <- forecast(fit, h = 12)

# Collect the point forecasts alongside their future dates.
forecast_df <- data.frame(
  Date = seq(max(ts_egg_df$Date) %m+% months(1), by = "month", length.out = 12),
  Mean = as.numeric(mean_forecast$mean),
  Naive = as.numeric(naive_forecast$mean),
  SNaive = as.numeric(snaive_forecast$mean),
  Drift = as.numeric(drift_forecast$mean),
  ARIMA = as.numeric(arima_forecast$mean)
)

# Plot the actual series plus all five forecasts.
# FIX: use `linewidth` — ggplot2 3.4.0 deprecated the `size` aesthetic
# for lines (the old code emitted exactly that deprecation warning).
ggplot(ts_egg_df, aes(x = Date, y = Price)) +
  geom_line(color = "black", linewidth = 1) + # actual data
  geom_line(data = forecast_df, aes(x = Date, y = Mean, color = "Mean"), linewidth = 1, linetype = "dashed") +
  geom_line(data = forecast_df, aes(x = Date, y = Naive, color = "Naïve"), linewidth = 1, linetype = "dotted") +
  geom_line(data = forecast_df, aes(x = Date, y = SNaive, color = "Seasonal Naïve"), linewidth = 1, linetype = "dotdash") +
  geom_line(data = forecast_df, aes(x = Date, y = Drift, color = "Drift"), linewidth = 1, linetype = "twodash") +
  geom_line(data = forecast_df, aes(x = Date, y = ARIMA, color = "ARIMA Fit"), linewidth = 1) +
  ggtitle("Egg Price Forecast") +
  xlab("Year") + ylab("Egg Price") +
  scale_color_manual(values = c("Mean" = "blue", "Naïve" = "red",
                                "Seasonal Naïve" = "green", "Drift" = "orange",
                                "ARIMA Fit" = "purple")) +
  guides(colour = guide_legend(title = "Forecast Methods")) +
  theme_minimal()
# Compare in-sample accuracy: seasonal-naive benchmark vs the ARIMA fit.
accuracy(snaive(ts_egg, h = 12))
accuracy(fit) # the fit is better (lower RMSE/MAE than the benchmark)

# Forecast the next 3 years (36 months).
fit %>% forecast(h = 36) %>% autoplot() # next 3 years
# FIX: line-fusion had swallowed this call into the comment above; the
# printed $pred/$se tables below show it was meant to run as a statement.
sarima.for(ts_egg, 36, 2, 1, 0, 0, 1, 1, 12)$pred
Jan Feb Mar Apr May Jun Jul Aug
2025 4.234900 4.275501 4.230793 4.265863 4.151832 4.091748 4.126341 4.176166
2026 4.480939 4.472025 4.416975 4.444440 4.328499 4.267228 4.301480 4.351118
2027 4.655785 4.646871 4.591820 4.619285 4.503345 4.442073 4.476326 4.525963
Sep Oct Nov Dec
2025 4.250964 4.231531 4.276110 4.444554
2026 4.425857 4.406394 4.450963 4.619402
2027 4.600702 4.581239 4.625808 4.794247
$se
Jan Feb Mar Apr May Jun Jul
2025 0.1220128 0.1777180 0.2305567 0.2744788 0.3135397 0.3484406 0.3803092
2026 0.5357042 0.5595878 0.5828422 0.6052431 0.6268924 0.6478266 0.6681120
2027 0.7806826 0.7995263 0.8182012 0.8364897 0.8544247 0.8719971 0.8892279
Aug Sep Oct Nov Dec
2025 0.4097385 0.4372126 0.4630646 0.4875507 0.5108653
2026 0.6878008 0.7069425 0.7255796 0.7437500 0.7614868
2027 0.9061322 0.9227277 0.9390302 0.9550545 0.9708141